#!pip install beautifulsoup4
#!pip install requests
# NOTE: urllib2 is Python 2-only and nothing below uses it; no extra install is needed.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import re
import plotly.graph_objects as go
import plotly.express as px
UNITS = {'s':'seconds', 'm':'minutes', 'h':'hours', 'd':'days', 'w':'weeks'}
from plotly.offline import init_notebook_mode, iplot
def convert_to_seconds(s):
    """Parse a duration string such as ``"1h30m"`` or ``"45"`` into whole MINUTES.

    NOTE: despite the name, the return value is minutes — total seconds
    divided by 60 and truncated with ``int()`` — which matches how callers
    use it (durations are plotted "in minutes"). The name is kept for
    backward compatibility.

    Unit letters are s/m/h/d/w (case-insensitive); a bare number counts as
    seconds. If a unit appears more than once, the last occurrence wins.
    """
    unit_names = {'s': 'seconds', 'm': 'minutes', 'h': 'hours', 'd': 'days', 'w': 'weeks'}
    parts = {}
    for match in re.finditer(r'(?P<val>\d+)(?P<unit>[smhdw]?)', s, flags=re.I):
        key = unit_names.get(match.group('unit').lower(), 'seconds')
        parts[key] = int(match.group('val'))
    return int(timedelta(**parts).total_seconds() / 60)
# --- Plotly notebook rendering setup ---------------------------------------
init_notebook_mode()
import plotly.io as pio
pio.renderers.default = "notebook"

# --- Scrape the product links from the zoolert home page -------------------
url = "http://www.zoolert.com/"
page = requests.get(url)
# Explicit parser: bare BeautifulSoup(data) emits a warning and may pick a
# different parser per machine.
soup = BeautifulSoup(page.text, 'html.parser')

# BUG FIX: anchors without an href attribute make link.get('href') return
# None, which crashed the original substring test — filter None out first.
linkal = [link.get('href') for link in soup.find_all('a')]
link_home = {href for href in linkal if href and '/home/' in href}

# sorted() gives the DataFrame a deterministic row order (sets are unordered).
link_home = pd.DataFrame(sorted(link_home), columns=['url'])
# The product slug is the second-to-last path component,
# e.g. '/home/oximeter/instock.php' -> 'oximeter'  (assumption — TODO confirm).
link_home['prod_name'] = link_home['url'].str.split('/').str[-2]

# Round-trip through CSV, as the original did (keeps downstream cells
# working from the file rather than the in-memory frame).
link_home.to_csv('product_name_home.csv')
link_home = pd.read_csv('product_name_home.csv')
def get_histchange(prod_name):
    """Scrape the restock history table for *prod_name* from zoolert.com.

    Parameters
    ----------
    prod_name : str
        Product slug used in the zoolert URL, e.g. 'oximeter'.

    Returns
    -------
    pandas.DataFrame or None
        Columns ['retailer', 'name', 'time_update', 'duration', 'price',
        'product']; one row per 4-cell history entry. As in the original,
        only the LAST <tbody> found is returned (each iteration overwrites
        the previous frame). Returns None when the page has no <tbody>.
    """
    url = 'https://www.zoolert.com/home/' + prod_name + '/instockhistory.php'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # BUG FIX: the original called soup.find(id=='div'), which compares the
    # *builtin* `id` to the string 'div' (always False) instead of locating
    # a tag. Search the <tbody> elements directly.
    df = None
    for tbody in soup.find_all('tbody'):
        cells = list(tbody.stripped_strings)
        nrow = len(cells) // 4
        # Truncate to a whole number of rows so reshape cannot fail on a
        # malformed table.
        df = pd.DataFrame(np.array(cells[:nrow * 4]).reshape(nrow, 4),
                          columns=['retailer', 'name', 'time_update', 'duration'])
        # The site appends the price to the product name after '--'.
        df['price'] = df['name'].str.split('--', n=1, expand=True)[1]
        df['product'] = prod_name
    print(prod_name)  # progress indicator for the scraping loop
    return df
# Scrape every product's history once, then concatenate a single time.
# (The original concatenated inside the loop, which is quadratic, and would
# crash if a scrape returned None.)
frames = []
for prod_name in link_home["prod_name"]:
    frames.append(get_histchange(prod_name))
history_df = pd.concat([f for f in frames if f is not None])

# Snapshot the scrape to a dated CSV so the analysis below can be re-run
# without hitting the site again.
date = datetime.date(datetime.now())
history_df.to_csv('home_hist_' + str(date) + '.csv')
# Reload today's snapshot and derive the time features used by the plots.
date = datetime.date(datetime.now())
history_df = pd.read_csv('home_hist_' + str(date) + '.csv')

# Parse the timestamp column ONCE (the original re-parsed it four times).
ts = pd.to_datetime(history_df['time_update'])
history_df['time'] = ts.dt.time
history_df['date'] = ts.dt.date
history_df['hour'] = ts.dt.hour
# Duration strings like '1h30m' -> whole minutes (see convert_to_seconds).
history_df['duration'] = history_df['duration'].apply(convert_to_seconds)
# Ordered categorical so plots sort Monday..Sunday instead of alphabetically.
history_df['weekday'] = pd.Categorical(
    ts.dt.day_name(),
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday'],
    ordered=True)
Zoolert.com keeps up to 600 restock data points for each product. To make this a fair comparison, I subset the data to the last week and zoom in on the products that I think people cared most about during COVID-19.
# Restrict to the last week of data and to the products of interest.
# NOTE(review): 'disposable-laex-gloves' looks like a typo for
# 'disposable-latex-gloves' — confirm against the scraped slugs; kept
# verbatim to preserve behavior.
focus_products = [
    'disinfecting-wipes',
    'antibacterial-hand-soap',
    'emergen_c_vitamin',
    'oximeter',
    'disposable-laex-gloves',
    'isopropyl-alcohol',
    'all-purpose-cleaner',
]
history_df1 = history_df[history_df['date'] > pd.Timestamp(2020, 4, 19)]
history_df1 = history_df1[history_df1['product'].isin(focus_products)]
The graph summarises the total number of restocks for each product, aggregated across retailers, by day of the week. It looks like Wednesday is a good day for shopping.
# Restock counts per (product, weekday), as a tidy frame for plotting.
restocks_by_day = (history_df1
                   .groupby(['product', 'weekday'])['product']
                   .count()
                   .to_frame()
                   .add_suffix('_Count')
                   .reset_index())

fig = px.area(restocks_by_day, x='weekday', y='product_Count', color='product')
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.update_layout(
    title_text='Number of Restocks by Day of Week',
    yaxis=dict(title='Number of Restocks'),
    xaxis=dict(title='Day of Week'),
    font=dict(size=16),
)
fig.show()
Next, I want to see when restocks happen throughout the day. The chance of a restock appears lower in the early morning.
# Restock counts per (hour, product) to see intraday restock timing.
restocks_by_hour = (history_df1
                    .groupby(['hour', 'product'])['product']
                    .count()
                    .to_frame()
                    .add_suffix('_Count')
                    .reset_index())

fig = px.area(restocks_by_hour, x='hour', y='product_Count', color='product')
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.update_xaxes(tick0=0, dtick=2)  # one tick every two hours
fig.update_layout(
    title_text='Number of Restocks by Hour of Day',
    yaxis=dict(title='Count'),
    xaxis=dict(title='Hour of the day'),
    font=dict(size=16),
)
fig.show()
Zoolert also tells you how long a restock lasts until it is depleted. This plot summarises the average duration (in minutes) for each product and day of the week.
# Mean restock duration (minutes) per (product, weekday).
# (reset_index() on the Series yields the same frame as the original
# DataFrame + add_suffix('') round-trip.)
mean_duration = (history_df1
                 .groupby(['product', 'weekday'])['duration']
                 .mean()
                 .reset_index())

fig = px.bar(mean_duration,
             x='weekday', y='duration',
             color='product', facet_col='product', facet_col_wrap=2,
             log_y=True, height=900)
fig.update_layout(
    title_text='Duration until Restock Depletes ',
    showlegend=False,
    font=dict(size=16),
)
fig.show()
# Mean restock duration per (retailer, weekday), limited to the four big
# retailers.
retailer_duration = (history_df1
                     .groupby(['retailer', 'weekday'])['duration']
                     .mean()
                     .reset_index())
big_four = ['Amazon', 'BestBuy', 'Target', 'Walmart']
retailer_duration = retailer_duration[retailer_duration['retailer'].isin(big_four)]

fig = px.bar(retailer_duration,
             x='weekday', y='duration',
             color='retailer', facet_col='retailer', facet_col_wrap=3)
fig.update_yaxes(matches=None)  # let each facet scale independently
fig.update_layout(
    title_text='Duration until Restock Depletes ',
    showlegend=False,
    yaxis=dict(title=''),
    font=dict(size=16),
)
fig.show()
# Drill into a single product: restock counts by retailer / weekday / hour.
history_df2 = history_df1[history_df1['product'] == 'all-purpose-cleaner']

counts = (history_df2
          .groupby(['retailer', 'weekday', 'hour'])['product']
          .count()
          .reset_index())

fig = px.bar(counts,
             x='hour', y='product',
             color='retailer', facet_col='retailer',
             facet_row='weekday', height=1500)

# BUG FIX: the original ran this loop BEFORE creating the figure above, so
# it blanked the y-axis titles of the *previous* figure and this facet grid
# kept its repeated axis labels. Run it after the figure exists.
for axis in fig.layout:
    if isinstance(fig.layout[axis], go.layout.YAxis):
        fig.layout[axis].title.text = ''

fig.update_layout(
    title_text='Number of restocks by retailer: all-purpose cleaner',
    showlegend=False,
    yaxis=dict(title=''),
    font=dict(size=16),
)
fig.show()